Prerequisites
Load required packages
library(tidyverse)
library(dplyr)
library(ggplot2)
library(rtweet)
library(readr)
library(DataExplorer)
Dataset
Import processed data, which can be found here.
#read preprocessed data
wines <- read.csv(file = '../data/processed_data/wines.csv')
Get sample of dataset
#set seed value to birthday of Ricardo Rodriguez, American wrestler and ring announcer and Dr. Reinaldo (Rei) Sanchez-Arias
set.seed(19630217)
#set percentage to test with for simplicity, if needed
percentage <- 5
wine_sample<- sample_n(wines, percentage/100*nrow(wines))
Split Taster data into different Data Frame
tasters <- wines %>%
select(taster_name, taster_twitter_handle) %>% unique()
tasters
Drop taster_twitter_handle in wines dataframe
wines <- wines %>%
select(-taster_twitter_handle)
head(wines)
Add Reviewer profile info
Each reviewer has their own bias. To offset that, we made a “profile” for each reviewer, which includes characteristics like avg_points, sd_points, and var_points.
taster_rating_profile <- wines %>%
group_by(taster_name) %>%
summarize(
avg_points = mean(points),
sd_points = sd(points),
var_points = var(points),
reviews = n()
)
tasters <- inner_join(tasters, taster_rating_profile, by = "taster_name")
head(tasters)
Add Rating Classification
Add following classification to wine dataset as found on the website:
| Category   | Rating | Description                                            |
|------------|--------|--------------------------------------------------------|
| Classic    | 98-100 | The pinnacle of quality.                               |
| Superb     | 94-97  | A great achievement.                                   |
| Excellent  | 90-93  | Highly recommended.                                    |
| Very Good  | 87-89  | Often good value; well recommended.                    |
| Good       | 83-86  | Suitable for everyday consumption; often good value.   |
| Acceptable | 80-82  | Can be employed in casual, less-critical circumstances |
# function to add rating
rating_category <- function(points){
if(points>=98){
return("Classic")
}
else if (points>=94){
return("Superb")
}
else if(points>=90){
return("Excellent")
}
else if(points>=87){
return("Very Good")
}
else if(points>=83){
return("Good")
}
else{
return("Acceptable")
}
}
wines<- wines %>%
rowwise() %>%
mutate(rating_category = rating_category(points))
head(wines)
Add Adjusted Points
Since each reviewer has a different bias, we created a normalized metric, norm_points, by looking at the number of standard deviations a wine is from the reviewer’s avg_points. This gives us a more accurate representation of which wines are better than the rest.
normalize_points <- function(data){
left_join(data, tasters, by = "taster_name")%>%
rowwise() %>%
mutate(norm_points = (points-avg_points)/sd_points) %>%
select(-avg_points, -sd_points, -var_points, -taster_twitter_handle, -reviews)
}
wines <- normalize_points(wines)
head(wines)
Data Sanitation
Vintage seems to have year 7200
wines <- wines %>%
filter(vintage<2019)
Data Exploration
Univariate Exploration
Correlation price by points, using DataExplorer library which can be found here
# TODO: IZZY
Alcohol Amount
# TODO: IZZY
Vintage
Count wines per year (Note: Data has been sanitized)
wines %>%
group_by(vintage) %>%
summarize(count = n())
Grouping rowwise data frame strips rowwise nature
wines %>%
ggplot() +
geom_bar(mapping = aes(x=vintage))

Winery
Count the number of wines per winery (in a column graph)
wines %>%
group_by(winery) %>%
summarize(count = n()) %>%
arrange(desc(count)) %>%
slice(1:15) %>%
ggplot() +
geom_col(mapping = aes(x=count, y =winery))
Province
Count the number of wines per province (in a column graph)
wines %>%
group_by(province) %>%
Price
(graphs to understand where the majority of the data is. (in a column graph))
mean_price <- (mean(wines$price, na.rm = TRUE))
sd_price <- (sd(wines$price, na.rm = TRUE))
min_price <- (min(wines$price, na.rm = TRUE))
max_price <- print(max(wines$price, na.rm = TRUE))
[1] 3300
ggplot(mapping = aes(mean_price, sd_price, min_price, max_price))+
geom_boxplot()
Duplicated aesthetics after name standardisation:

Points
(graphs to understand where the majority of the data is. (in a column graph))
print(mean(wines$points))
print(sd(wines$points))
print(min(wines$points))
print(max(wines$points))
Points distribution by Reviewer
wines %>%
ggplot() +
geom_boxplot(aes(y=taster_name, x=points)) +
geom_vline(xintercept = mean(wines$points))
Multivariate Exploration
Price by Points
Notice the data is “stacked” and the scores range from 80-100
wines %>%
ggplot() +
geom_point(mapping = (aes(x = points, y = price)), na.rm = T, alpha = 0.15) +
labs(title = "Price by Points", x = "Points", y = "Price")
TODO: IZZY (Why did we log this?)
wines %>%
ggplot() +
geom_point(mapping = (aes(x = points, y = log(price))), na.rm = T, alpha = 0.15) +
labs(title = "log(Price) by Points", x = "Points", y = "log(Price)")
Data Analysis
#Find the best province for wine using the average points across the 1,000 samples #drop the descriptions or just select price? set points to max(points)
mean_points <- mean(wine_sample$points)
mean_points
best_province <- wine_sample %>%
group_by(points) %>%
filter(points > mean_points) %>%
arrange(desc(points))
best_province
Best wine, by variety
#wine_best_variety <-
wines %>%
group_by(variety) %>%
summarise(mean_points = mean(points)) %>%
arrange(desc(mean_points))
user_price <- readline(prompt = "How much are you willing to spend on a bottle?")
user_price <- as.integer(user_price)
wines %>%
filter(price <= user_price) %>%
arrange(desc(points)) %>%
select(title, price, points)
Conclusion
---
title: "Exploring and Analyzing Wine Enthusiast Reviews"
output: html_notebook
---

# Prerequisites

Load required packages
```{r, message=FALSE, warning=FALSE}
library(tidyverse)
library(dplyr)
library(ggplot2)
library(rtweet)
library(readr)
library(DataExplorer)
```

# Dataset

Import processed data, which can be found [here](https://github.com/C4rbyn3m4n/wine_reviews_data_analysis/blob/master/data/processed_data/preprocessing.rmd).

```{r}
# Load the preprocessed wine reviews into `wines` (path is relative).
wines <- read.csv("../data/processed_data/wines.csv")
```

Get sample of dataset
```{r}
# Fix the RNG seed so the random sample below is reproducible.
# (Seed value: birthday of Ricardo Rodriguez, American wrestler and ring
# announcer, and Dr. Reinaldo (Rei) Sanchez-Arias.)
set.seed(19630217)

# Percentage of rows to draw as a smaller working sample, if needed.
percentage <- 5
wine_sample <- wines %>%
  sample_n(percentage / 100 * nrow(wines))
```

### Split Taster data into different Data Frame

```{r}
# One row per distinct (taster_name, taster_twitter_handle) pair.
tasters <- unique(select(wines, taster_name, taster_twitter_handle))
tasters
```

Drop `taster_twitter_handle` in wines dataframe

```{r}
# The handle is kept in `tasters`, so remove it from the review table.
wines <- select(wines, -taster_twitter_handle)
head(wines)
```
## Add Reviewer profile info

Each reviewer has their own bias. To offset that, we made a "profile" for each reviewer, which includes characteristics like `avg_points`, `sd_points`, and `var_points`.
```{r}
# Per-reviewer scoring profile: mean, spread and number of reviews.
taster_rating_profile <- summarize(
  group_by(wines, taster_name),
  avg_points = mean(points),
  sd_points  = sd(points),
  var_points = var(points),
  reviews    = n()
)

# Attach the profile to each taster record (tasters present in both tables).
tasters <- tasters %>%
  inner_join(taster_rating_profile, by = "taster_name")
head(tasters)
```
### Add Rating Classification

Add following classification to wine dataset as found on the [website](https://www.winemag.com/2010/04/09/you-asked-how-is-a-wines-score-determined/):

|Category  | Rating  | Description                                            |
|----------|---------|--------------------------------------------------------|
|Classic   |	98-100 | The pinnacle of quality.                               |
|Superb    |	94-97	 | A great achievement.                                   |
|Excellent |	90-93	 | Highly recommended.                                    |
|Very Good |  87-89	 | Often good value; well recommended.                    |
|Good	     |  83-86	 | Suitable for everyday consumption; often good value.   |
|Acceptable|	80-82	 | Can be employed in casual, less-critical circumstances |

```{r}
# function to add rating
# Map point scores to their rating category.
#
# Vectorised: `points` may be a single score or a whole column, so the
# function can be used directly inside `mutate()` without `rowwise()`.
#
# @param points Numeric vector of scores.
# @return Character vector with one category per element of `points`;
#         `NA` scores yield `NA`.
rating_category <- function(points){
  categories <- c(
    "Acceptable", "Good", "Very Good", "Excellent", "Superb", "Classic"
  )
  # Lower bound of every category above "Acceptable". Scores below 83 fall
  # into "Acceptable", exactly as the final else-branch used to do.
  lower_bounds <- c(83, 87, 90, 94, 98)
  # findInterval() counts how many lower bounds are <= the score (0 to 5).
  categories[findInterval(points, lower_bounds) + 1L]
}

# Tag every wine with its rating category.
# rowwise() feeds rating_category() one score at a time; ungroup() then
# drops the row-wise state so later verbs (group_by(), summarize(), ...)
# work on the whole table again without a "strips rowwise nature" warning.
wines <- wines %>%
  rowwise() %>%
  mutate(rating_category = rating_category(points)) %>%
  ungroup()
head(wines)
```

## Add Adjusted Points

Since each reviewer has a different bias, we created a normalized metric, `norm_points`, by looking at the number of standard deviations a wine is from the reviewer's `avg_points`. This gives us a more accurate representation of which wines are better than the rest.

```{r}
# Add `norm_points`: how many of the reviewer's own standard deviations a
# wine's score sits above or below that reviewer's average score.
#
# @param data     Data frame of reviews with `taster_name` and `points`.
# @param profiles Per-taster profile with `taster_name`, `avg_points` and
#                 `sd_points`; defaults to the notebook's `tasters` table.
# @return `data` with one extra column, `norm_points`. The result is not
#         row-wise, so it can be grouped and summarised directly.
normalize_points <- function(data, profiles = tasters){
  # Keep only the columns needed and one row per taster, so the join can
  # neither duplicate reviews nor drag other profile columns into the result.
  profile_stats <- profiles %>%
    ungroup() %>%
    select(taster_name, avg_points, sd_points) %>%
    distinct(taster_name, .keep_all = TRUE)

  data %>%
    ungroup() %>%
    left_join(profile_stats, by = "taster_name") %>%
    # Plain column arithmetic is already vectorised; no rowwise() required.
    mutate(norm_points = (points - avg_points) / sd_points) %>%
    select(-avg_points, -sd_points)
}

wines <- normalize_points(wines)
head(wines)
```

## Data Sanitation

The `vintage` column contains at least one implausible year (7200), so only wines with a vintage before 2019 are kept.
``` {r}
# Keep only wines whose vintage is earlier than 2019. Rows where the
# comparison is NA are dropped by filter() as well.
wines <- filter(wines, vintage < 2019)
```
# Data Exploration

## Univariate Exploration
Correlation `price` by `points`, using ```DataExplorer``` library which can be found [here](https://datascienceplus.com/blazing-fast-eda-in-r-with-dataexplorer/)
```{r}
# TODO: IZZY
```

### Alcohol Amount
```{r}
# TODO: IZZY
```

### Category
```{r}
# TODO: IZZY
```

### Vintage
Count wines per year (Note: Data has been sanitized)
```{r}
# Number of reviewed wines for each vintage year.
summarize(group_by(wines, vintage), count = n())
```


```{r}
# Bar chart of how many wines fall in each vintage year.
ggplot(wines, aes(x = vintage)) +
  geom_bar()
```

### Winery
Count the number of wines per winery (in a column graph)
```{r}
# Fifteen wineries with the most reviewed wines, drawn as horizontal bars.
wines %>%
  group_by(winery) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  slice(seq_len(15)) %>%
  ggplot(aes(x = count, y = winery)) +
  geom_col()
```

### Province
Count the number of wines per province (in a column graph)
```{r}
# Ten provinces with the most reviewed wines, drawn as horizontal bars.
# geom_col() plots the precomputed `count` as-is; geom_bar() counts rows
# itself and rejects having both x and y mapped.
wines %>%
  group_by(province) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  slice(1:10) %>%
  ggplot() +
  geom_col(mapping = aes(x = count, y = province))
```

### Price
(graphs to understand where the majority of the data is. (in a column graph))
```{r}
# Summary statistics for price; missing prices are ignored.
mean_price <- mean(wines$price, na.rm = TRUE)
sd_price <- sd(wines$price, na.rm = TRUE)
min_price <- min(wines$price, na.rm = TRUE)
max_price <- max(wines$price, na.rm = TRUE)
print(max_price)

# Box plot of the price column itself. The four scalars above cannot be
# used as aesthetics: aes() takes them positionally as x, y and two unnamed
# extras, which triggers the "Duplicated aesthetics" warning and draws
# nothing useful.
ggplot(wines, aes(x = price)) +
  geom_boxplot(na.rm = TRUE)
```

### Points 
(graphs to understand where the majority of the data is. (in a column graph))
```{r}
# Print mean, standard deviation, minimum and maximum of the points column.
invisible(with(wines, {
  print(mean(points))
  print(sd(points))
  print(min(points))
  print(max(points))
}))
```


Points distribution by Reviewer
```{r}
# One box per reviewer showing their score distribution; the vertical line
# marks the mean score over all wines.
ggplot(wines, aes(x = points, y = taster_name)) +
  geom_boxplot() +
  geom_vline(xintercept = mean(wines$points))
```

## Multivariate Exploration

## Price by Points
Notice the data is "stacked" and the scores range from 80-100
```{r}
# Scatter plot of price against points; low alpha shows where points pile
# up, and rows with missing values are dropped without a warning.
ggplot(wines, aes(x = points, y = price)) +
  geom_point(alpha = 0.15, na.rm = TRUE) +
  labs(title = "Price by Points", x = "Points", y = "Price")
```

TODO: IZZY (Why did we log this?)

```{r}
# Same scatter plot with price on a natural-log scale.
ggplot(wines, aes(x = points, y = log(price))) +
  geom_point(alpha = 0.15, na.rm = TRUE) +
  labs(title = "log(Price) by Points", x = "Points", y = "log(Price)")
```

# Data Analysis

Find the best province for wine, using the average points across the 1,000 samples.

Open questions: drop the descriptions or just select `price`? Set `points` to `max(points)`?
```{r}
# Overall mean score in the sample, used as the baseline for comparison.
mean_points <- mean(wine_sample$points)
mean_points

# Rank provinces by their average score in the sample and keep those that
# beat the overall mean. `reviews` shows how many wines back each average,
# since a province with very few wines can rank high by chance.
best_province <- wine_sample %>%
  group_by(province) %>%
  summarize(avg_points = mean(points), reviews = n()) %>%
  filter(avg_points > mean_points) %>%
  arrange(desc(avg_points))
best_province
```


Best wine, by variety
```{r}
# Average score per grape variety, best first.
#wine_best_variety <-
arrange(
  summarise(group_by(wines, variety), mean_points = mean(points)),
  desc(mean_points)
)
  
```

```{r}
# Ask for a budget and list the highest-scoring wines within it.
# readline() only waits for input in an interactive session; otherwise it
# returns "" immediately, so a fallback budget keeps the result non-empty.
default_price <- 50  # arbitrary fallback budget; adjust as needed
user_price <- readline(prompt = "How much are you willing to spend on a bottle?")
# as.numeric() keeps cents (e.g. "19.99"); empty or unparseable input
# becomes NA and is replaced by the fallback.
user_price <- suppressWarnings(as.numeric(user_price))
if (is.na(user_price)) {
  user_price <- default_price
}

wines %>%
  filter(price <= user_price) %>%
  arrange(desc(points)) %>%
  select(title, price, points)
```


# Conclusion
